Project

Project

Introduction

Problem Statement

Data Collection

# Importing required libraries
library(readr)
library(tidyverse)
library(tidymodels)
library(ggplot2)
library(dplyr)
library(caret)
library(e1071)
library(rpart)

# Load the soil measurements. read.csv() is used instead of read.csv2():
# read.csv2() targets semicolon-separated files and defaults to dec = ",",
# whereas this file is comma-separated and, judging by the values printed
# below, uses "." as the decimal mark.
crop <- read.csv("Cropdata.csv", header = TRUE, sep = ",")

# View() opens a GUI data viewer, so only call it in interactive sessions;
# this keeps batch runs (Rscript, knitting) from failing on it.
if (interactive()) {
  View(crop)
}

# Print the column types and a preview of each column.
str(crop)
## 'data.frame':    902 obs. of  7 variables:
##  $ Time.line: Factor w/ 5 levels "2014-2015","2015-2016",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ PH       : num  7.86 7.71 8.01 7.83 8.11 7.53 8.11 7.3 7.69 7.53 ...
##  $ EC       : num  0.931 0.694 1.11 1.09 1.14 1.02 1.14 1.02 0.921 1.02 ...
##  $ N        : num  168 100 91 100 82 ...
##  $ P        : num  17 19.9 16 19.9 17 19.9 17 19.9 17 19.9 ...
##  $ k        : num  32 43.6 99 43.6 102 43.6 1.2 43.6 36 43.6 ...
##  $ Total    : num  226 172 215 173 210 ...
# Show the first rows as a quick sanity check of the parsed columns.
head(crop)
##   Time.line   PH    EC     N    P     k   Total
## 1 2014-2015 7.86 0.931 168.0 17.0  32.0 225.791
## 2 2014-2015 7.71 0.694 100.1 19.9  43.6 172.004
## 3 2014-2015 8.01 1.110  91.0 16.0  99.0 215.120
## 4 2014-2015 7.83 1.090 100.1 19.9  43.6 172.520
## 5 2014-2015 8.11 1.140  82.0 17.0 102.0 210.250
## 6 2014-2015 7.53 1.020 100.1 19.9  43.6 172.150
# Per-column summaries. The maxima printed below (e.g. PH up to 7388) sit far
# above the quartiles — NOTE(review): these look like data-entry outliers;
# confirm and handle them before modelling.
summary(crop)
##      Time.line         PH                EC                 N          
##  2014-2015:219   Min.   :   0.36   Min.   :  0.0090   Min.   :   1.81  
##  2015-2016:154   1st Qu.:   7.72   1st Qu.:  0.8152   1st Qu.: 100.10  
##  2017-2018:224   Median :   7.96   Median :  1.0050   Median : 152.00  
##  2018-2019:114   Mean   :  18.70   Mean   :  7.2213   Mean   : 139.50  
##  2019-2020:191   3rd Qu.:   8.10   3rd Qu.:  1.0900   3rd Qu.: 169.00  
##                  Max.   :7388.00   Max.   :952.0000   Max.   :1725.00  
##        P                 k              Total       
##  Min.   :   0.13   Min.   :  0.20   Min.   :  51.9  
##  1st Qu.:  14.00   1st Qu.: 42.00   1st Qu.: 172.7  
##  Median :  17.00   Median : 43.60   Median : 224.0  
##  Mean   :  26.37   Mean   : 57.85   Mean   : 249.4  
##  3rd Qu.:  19.90   3rd Qu.: 81.00   3rd Qu.: 282.4  
##  Max.   :1282.00   Max.   :641.00   Max.   :7552.6
# Round Total to a whole number so the crop bands below have clean,
# integer-valued boundaries.
crop$Total <- round(crop$Total, 0)

#**************************************************Step_1*******************************************
# Step 1: derive a Crop_Type label from the rounded Total.
# Each band is an inclusive range:
#   1-200   Ground Nut    201-214 Sugar Cane    215-235 Grape
#   236-244 Onion         245-250 Banana        251+    Turmeric
# Range comparisons replace the earlier `%in% a:b` lookups: they avoid
# building large integer vectors, do not rely on exact equality of doubles,
# and make the bands non-overlapping (200 used to appear in two bands, with
# the first match, "Ground Nut", winning). Totals below 1 and missing totals
# still yield NA; the top band is now open-ended instead of stopping at 100000.
crop_new <- mutate(
  crop,
  Crop_Type = case_when(
    Total < 1 ~ NA_character_,
    Total <= 200 ~ "Ground Nut",
    Total <= 214 ~ "Sugar Cane",
    Total <= 235 ~ "Grape",
    Total <= 244 ~ "Onion",
    Total <= 250 ~ "Banana",
    Total > 250 ~ "Turmeric"
  )
)



# Persist the labelled data set as comma-separated text, without row names.
write.table(crop_new, file = "crop_new.csv", sep = ",", row.names = FALSE)

# View() needs a GUI data viewer; skip it in non-interactive runs
# (Rscript, knitting) so the script does not fail there.
if (interactive()) {
  View(crop_new)
}

Data Preparation

# Fix the RNG state so the random train/test assignment is reproducible;
# without a seed every run produces a different split (and different files
# and model results downstream).
set.seed(7)

# Assign each row to partition 1 (probability 0.7) or partition 2
# (probability 0.3). Rows are drawn independently, so the realised split is
# only approximately 70/30.
sample_set <- sample(2, nrow(crop_new), replace = TRUE, prob = c(0.7, 0.3))

# Partition 1 is the training set. The data frame shares its name with
# caret::train(); calls of the form train(...) still resolve to the function.
train <- crop_new[sample_set == 1, ]
head(train)
##   Time.line   PH    EC     N    P     k Total  Crop_Type
## 2 2014-2015 7.71 0.694 100.1 19.9  43.6   172 Ground Nut
## 3 2014-2015 8.01 1.110  91.0 16.0  99.0   215      Grape
## 4 2014-2015 7.83 1.090 100.1 19.9  43.6   173 Ground Nut
## 5 2014-2015 8.11 1.140  82.0 17.0 102.0   210 Sugar Cane
## 7 2014-2015 8.11 1.140  82.0 17.0   1.2   109 Ground Nut
## 9 2014-2015 7.69 0.921  76.0 17.0  36.0   138 Ground Nut
# Save the training partition as comma-separated text, without row names.
write.table(
  x = train,
  file = "crop_train.csv",
  row.names = FALSE,
  sep = ","
)



# Rows drawn into partition 2 form the held-out test set.
test <- crop_new[which(sample_set == 2), ]
head(test)
##    Time.line   PH    EC     N    P     k Total  Crop_Type
## 1  2014-2015 7.86 0.931 168.0 17.0  32.0   226      Grape
## 6  2014-2015 7.53 1.020 100.1 19.9  43.6   172 Ground Nut
## 8  2014-2015 7.30 1.020 100.1 19.9  43.6   172 Ground Nut
## 13 2014-2015 8.39 6.000 100.1 19.9  43.6   178 Ground Nut
## 18 2014-2015 8.14 1.120  93.0 17.0 112.0   231      Grape
## 22 2014-2015 8.11 1.100  83.0 17.0 115.0   224      Grape
# Save the held-out test partition as comma-separated text, without row names.
# The data written is `test`, not the full `crop_new` table, so that
# "test.csv" holds only the test rows — mirroring how "crop_train.csv" is
# produced from `train`.
write.table(test, file = "test.csv", sep = ",", row.names = FALSE)

Data Cleaning

library(DataExplorer)
# Count missing cells across all columns of the training partition.
sum(is.na(train))
## [1] 0
# Same check for the test partition.
sum(is.na(test))
## [1] 0
# DataExplorer plot of the missing-value profile per column of `train`.
plot_missing(train)

# Exploratory Data Analysis (EDA) * describe - computes summary statistics for every variable

library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
## 
##     impute
## The following object is masked from 'package:parsnip':
## 
##     translate
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Per-variable overview of the training partition: counts, missing values,
# distinct values and, for numeric columns, mean and quantiles.
describe(train)
## train 
## 
##  8  Variables      629  Observations
## --------------------------------------------------------------------------------
## Time.line 
##        n  missing distinct 
##      629        0        5 
## 
## lowest : 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## highest: 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
##                                                             
## Value      2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## Frequency        166        94       159        79       131
## Proportion     0.264     0.149     0.253     0.126     0.208
## --------------------------------------------------------------------------------
## PH 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      629        0      155    0.999    11.62    7.951    7.310    7.580 
##      .25      .50      .75      .90      .95 
##    7.720    7.950    8.100    8.182    8.600 
## 
## lowest :   0.36000   1.12000   1.15000   2.06000   2.72000
## highest:  18.60000  18.70026 765.00000 768.00000 822.00000
##                                               
## Value          0    10    20   760   770   820
## Frequency      6   618     2     1     1     1
## Proportion 0.010 0.983 0.003 0.002 0.002 0.002
## 
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## EC 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      629        0      229    0.999    3.722    5.939    0.170    0.328 
##      .25      .50      .75      .90      .95 
##    0.831    0.994    1.090    1.282    1.840 
## 
## lowest :   0.009   0.040   0.050   0.060   0.070
## highest: 101.000 124.000 168.000 248.000 951.000
##                                                                 
## Value          0    10    20    80   100   120   170   250   950
## Frequency    612    10     1     1     1     1     1     1     1
## Proportion 0.973 0.016 0.002 0.002 0.002 0.002 0.002 0.002 0.002
## 
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      629        0       84    0.938      139    45.78    100.1    100.1 
##      .25      .50      .75      .90      .95 
##    100.1    152.0    169.0    179.0    189.0 
## 
## lowest :    1.81    7.87    8.00   15.00   15.40
## highest:  196.00  197.00  198.00  199.00 1725.00
## --------------------------------------------------------------------------------
## P 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      629        0       44    0.963    28.32    26.98      9.0     10.8 
##      .25      .50      .75      .90      .95 
##     14.0     17.0     19.9     19.9    153.0 
## 
## lowest :    0.13    1.00    5.00    7.00    8.00
## highest:  171.00  172.00  173.00  178.00 1282.00
## --------------------------------------------------------------------------------
## k 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      629        0       94    0.966    56.33    30.67     14.0     25.0 
##      .25      .50      .75      .90      .95 
##     41.0     43.6     81.0     95.0     99.0 
## 
## lowest :   0.2   1.2   4.0   6.0   7.0, highest: 122.0 129.0 146.0 160.0 196.0
## --------------------------------------------------------------------------------
## Total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      629        0      156    0.992      239    79.29    172.0    172.0 
##      .25      .50      .75      .90      .95 
##    173.0    223.0    281.0    296.0    307.6 
## 
## lowest :   78   98  106  109  113, highest: 1045 1079 1122 1397 1857
## --------------------------------------------------------------------------------
## Crop_Type 
##        n  missing distinct 
##      629        0        6 
## 
## lowest : Banana     Grape      Ground Nut Onion      Sugar Cane
## highest: Grape      Ground Nut Onion      Sugar Cane Turmeric  
##                                                                             
## Value          Banana      Grape Ground Nut      Onion Sugar Cane   Turmeric
## Frequency          13         70        233         21         46        246
## Proportion      0.021      0.111      0.370      0.033      0.073      0.391
## --------------------------------------------------------------------------------
# Same per-variable overview for the test partition, to compare its
# distributions with those of the training partition.
describe(test)
## test 
## 
##  8  Variables      273  Observations
## --------------------------------------------------------------------------------
## Time.line 
##        n  missing distinct 
##      273        0        5 
## 
## lowest : 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## highest: 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
##                                                             
## Value      2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## Frequency         53        60        65        35        60
## Proportion     0.194     0.220     0.238     0.128     0.220
## --------------------------------------------------------------------------------
## PH 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      273        0      104    0.999    35.02    54.81    7.306    7.592 
##      .25      .50      .75      .90      .95 
##    7.720    7.960    8.100    8.250    8.796 
## 
## lowest :    0.46000    0.81000    3.00000    6.08100    6.34000
## highest:    9.23000    9.91000   18.60000   18.70026 7388.00000
##                       
## Value          0  7400
## Frequency    272     1
## Proportion 0.996 0.004
## 
## For the frequency table, variable is rounded to the nearest 100
## --------------------------------------------------------------------------------
## EC 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      273        0      132    0.999    15.28     28.7    0.168    0.350 
##      .25      .50      .75      .90      .95 
##    0.810    1.010    1.090    1.288    3.104 
## 
## lowest :   0.009   0.070   0.080   0.090   0.110
## highest: 151.000 691.000 898.000 921.000 952.000
##                                                           
## Value          0    10   120   150   690   900   920   950
## Frequency    261     5     1     2     1     1     1     1
## Proportion 0.956 0.018 0.004 0.007 0.004 0.004 0.004 0.004
## 
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      273        0       58    0.949    140.7    40.22    100.1    100.1 
##      .25      .50      .75      .90      .95 
##    100.1    156.0    169.0    181.0    189.4 
## 
## lowest :   8  78  83  84  88, highest: 194 195 196 198 275
## --------------------------------------------------------------------------------
## P 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      273        0       24    0.958    21.87    14.32      9.6     12.0 
##      .25      .50      .75      .90      .95 
##     14.0     17.0     19.9     19.9     19.9 
## 
## lowest :   7   8   9  10  11, highest: 163 164 168 171 172
## --------------------------------------------------------------------------------
## k 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      273        0       70    0.961    61.34    34.08     23.6     34.0 
##      .25      .50      .75      .90      .95 
##     43.6     43.6     81.0     96.0    105.6 
## 
## lowest :   0.87   1.00  12.00  14.00  15.00, highest: 128.00 130.00 144.00 175.00 641.00
## --------------------------------------------------------------------------------
## Total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      273        0      106    0.993    273.4      141    172.0    172.0 
##      .25      .50      .75      .90      .95 
##    173.0    225.0    284.0    302.6    315.0 
## 
## lowest :   52  165  171  172  173, highest:  908 1100 1112 1152 7553
##                                                                       
## Value        100   200   300   400   500   800   900  1100  1200  7600
## Frequency      1   156   108     1     1     1     1     2     1     1
## Proportion 0.004 0.571 0.396 0.004 0.004 0.004 0.004 0.007 0.004 0.004
## 
## For the frequency table, variable is rounded to the nearest 100
## --------------------------------------------------------------------------------
## Crop_Type 
##        n  missing distinct 
##      273        0        6 
## 
## lowest : Banana     Grape      Ground Nut Onion      Sugar Cane
## highest: Grape      Ground Nut Onion      Sugar Cane Turmeric  
##                                                                             
## Value          Banana      Grape Ground Nut      Onion Sugar Cane   Turmeric
## Frequency           4         33         98          5         17        116
## Proportion      0.015      0.121      0.359      0.018      0.062      0.425
## --------------------------------------------------------------------------------
  • Two continuous variables
  • Taking PH & EC
library(ggplot2)

# log(PH) per sampling period, with points coloured by the assigned crop.
# Layers, bottom to top: a green line per period, large crop-coloured points,
# and a small light-grey point on top of each so the colour shows as a ring.
# The y-axis label states the log transform, because the plotted quantity is
# log(PH) (natural log), not raw PH.
q <- ggplot(data = train, aes(x = Time.line, y = log(PH))) +
  geom_line(colour = "darkgreen") +
  geom_point(aes(colour = factor(Crop_Type)), size = 3) +
  geom_point(colour = "grey90", size = 1.5) +
  labs(
    title = "Crop according to PH for  Time.line 2015-2020",
    y = "log(PH) of the soil",
    x = "Time.line"
  )
q

library(plotly)

# Animated scatter of log(P) against log(PH): one frame per Time.line value,
# marker size from k, marker colour from Crop_Type, and the raw P value as
# hover text.
fig <- train %>%
  plot_ly(
    x = ~log(PH),
    y = ~log(P),
    size = ~k,
    color = ~Crop_Type,
    frame = ~Time.line,
    text = ~P,
    hoverinfo = "text",
    type = "scatter",
    mode = "markers"
  )

# The values are already log-transformed, so the axes stay linear. Putting a
# log-type axis on top of log(PH) would apply the transform twice and cannot
# display the non-positive values produced by PH <= 1. Label the axes with
# the quantities actually plotted instead.
fig <- fig %>% layout(
  xaxis = list(title = "log(PH)"),
  yaxis = list(title = "log(P)")
)

fig
# Strip plot of log(PH) for each crop type, drawn with semi-transparent
# markers so overlapping points remain visible. The axis titles name the
# quantities actually plotted: crop type on y and log(PH) on x.
plot_ly(
  train,
  x = ~log(PH),
  y = ~Crop_Type,
  type = "scatter",
  mode = "markers",
  marker = list(color = "darkgreen"),
  opacity = 0.5
) %>%
  layout(
    title = "Crop according to PH for  Time.line 2015-2020",
    yaxis = list(title = "Crop type"),
    xaxis = list(title = "log(PH) of the soil")
  )

Boosting Algorithms

# Shared setup for every model fitted below.
library(mlbench)
library(caret)

# Convert the outcome to a factor so the models are fitted as classifiers.
train[["Crop_Type"]] <- as.factor(train[["Crop_Type"]])

# Seed reused before each fit, and the metric used to pick tuning values.
seed <- 7
metric <- "Accuracy"

# Resampling scheme: 10-fold cross-validation, repeated 3 times.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

Modelling

SvmRadial

# Support vector machine with a radial basis kernel, tuned with the shared
# resampling scheme and metric.
# NOTE(review): `Crop_Type ~ .` uses every other column as a predictor,
# including Total, from which Crop_Type was derived — confirm that this
# target leakage is intended.
set.seed(seed)
fit.svmRadial <- train(
  Crop_Type ~ .,
  data = train,
  method = "svmRadial",
  trControl = control,
  metric = metric
)
fit.svmRadial
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 629 samples
##   7 predictor
##   6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 566, 566, 564, 567, 565, 567, ... 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa    
##   0.25  0.8245223  0.7336092
##   0.50  0.8340136  0.7524886
##   1.00  0.8542609  0.7839876
## 
## Tuning parameter 'sigma' was held constant at a value of 0.3331884
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.3331884 and C = 1.

Stochastic Gradient Boosting

# Stochastic gradient boosting (gbm), tuned with the shared resampling scheme.
# verbose = FALSE is forwarded to the fitting routine to silence its
# per-iteration log.
# NOTE(review): Total is among the predictors and Crop_Type was derived from
# it, which likely explains the near-perfect accuracy — confirm intended.
set.seed(seed)
fit.gbm <- train(
  Crop_Type ~ .,
  data = train,
  method = "gbm",
  trControl = control,
  metric = metric,
  verbose = FALSE
)
fit.gbm
## Stochastic Gradient Boosting 
## 
## 629 samples
##   7 predictor
##   6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 566, 566, 564, 567, 565, 567, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.9973620  0.9961856
##   1                  100      0.9973620  0.9961955
##   1                  150      0.9968411  0.9954484
##   2                   50      0.9968497  0.9954635
##   2                  100      0.9957912  0.9939446
##   2                  150      0.9957912  0.9939446
##   3                   50      0.9973708  0.9962215
##   3                  100      0.9958080  0.9939753
##   3                  150      0.9952704  0.9932036
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth =
##  3, shrinkage = 0.1 and n.minobsinnode = 10.

kNN

# k-nearest neighbours. Predictors are centred and scaled first so that no
# single column dominates the distance computation.
# NOTE(review): Total is among the predictors and Crop_Type was derived from
# it — confirm that this target leakage is intended.
set.seed(seed)
fit.knn <- train(
  Crop_Type ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = control,
  metric = metric
)
fit.knn
## k-Nearest Neighbors 
## 
## 629 samples
##   7 predictor
##   6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric' 
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 566, 566, 564, 567, 565, 567, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.8928668  0.8448428
##   7  0.8807039  0.8269661
##   9  0.8780339  0.8219489
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.

Random Forest

# Random forest, tuned over mtry with the shared resampling scheme.
# NOTE(review): Total is among the predictors and Crop_Type was derived from
# it, which likely explains the near-perfect accuracy — confirm intended.
set.seed(seed)
fit.rf <- train(
  Crop_Type ~ .,
  data = train,
  method = "rf",
  trControl = control,
  metric = metric
)
fit.rf
## Random Forest 
## 
## 629 samples
##   7 predictor
##   6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 566, 566, 564, 567, 565, 567, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9649931  0.9492937
##    6    0.9936324  0.9907591
##   10    0.9984207  0.9977162
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.

Model Selection

summarize results

# Gather the cross-validation resamples of the four fitted models so they can
# be compared side by side. Despite the object name, only gbm is a boosting
# method; the others are an SVM, kNN and a random forest.
boosting_results <- resamples(
  list(
    svmRadial = fit.svmRadial,
    gbm = fit.gbm,
    knn = fit.knn,
    rf = fit.rf
  )
)
boosting_results
## 
## Call:
## resamples.default(x = list(svmRadial = fit.svmRadial, gbm = fit.gbm, knn
##  = fit.knn, rf = fit.rf))
## 
## Models: svmRadial, gbm, knn, rf 
## Number of resamples: 30 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit
# Distribution of Accuracy and Kappa across the 30 resamples of each model.
summary(boosting_results)
## 
## Call:
## summary.resamples(object = boosting_results)
## 
## Models: svmRadial, gbm, knn, rf 
## Number of resamples: 30 
## 
## Accuracy 
##               Min.   1st Qu.    Median      Mean   3rd Qu.     Max. NA's
## svmRadial 0.812500 0.8387097 0.8571429 0.8542609 0.8704389 0.890625    0
## gbm       0.983871 1.0000000 1.0000000 0.9973708 1.0000000 1.000000    0
## knn       0.812500 0.8730159 0.8977667 0.8928668 0.9058780 0.952381    0
## rf        0.983871 1.0000000 1.0000000 0.9984207 1.0000000 1.000000    0
## 
## Kappa 
##                Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## svmRadial 0.7261056 0.7622753 0.7879185 0.7839876 0.8056046 0.8391960    0
## gbm       0.9768484 1.0000000 1.0000000 0.9962215 1.0000000 1.0000000    0
## knn       0.7305263 0.8157818 0.8495089 0.8448428 0.8641996 0.9292929    0
## rf        0.9765329 1.0000000 1.0000000 0.9977162 1.0000000 1.0000000    0
# Dot plot comparing the resampled performance metrics of the models.
dotplot(boosting_results)

# Bagging Algorithms